R Markdown

Objective:

GSE151158,GSE58979,GSE63067,GSE89632 are merged and split into training and testing data for the random forest.With these dataset the parameters for the random forest were tuned.GSM836223 is used for validating the model.

Loading the library and dataset

library(randomForest)
library(caret)
library(e1071)
library(alookr)
library(dplyr)
library(pROC)
library(ggplot2)
library(ggpubr)
library(ggsignif)
library(enrichR)
library(DT)
library(corrplot)
library(dplyr)
library(tibble)

load the data

data<-read.csv("dataset_model.csv",header = T,row.names = 1)

validation<-read.csv("validation.csv",header = T,row.names = 1)



data_scaled<-scale(data[,2:ncol(data)])

data_scaled<-as.data.frame(data_scaled)

data_scaled<-add_column(data_scaled, sample=data$sample, .before = 1)

data<-data_scaled

#write.csv(data,"significant_genes.csv")

head(data)
##               sample         BTK      PTPN6   SERPING1        BAX      ITGAE
## GSM2385720 steatosis  1.08127960  0.1322743 -1.2408517 -0.9896539 1.68591912
## GSM2385722 steatosis  0.38673710 -1.7828508  0.1708034 -0.0515581 1.89125923
## GSM2385723 steatosis -2.03642411 -1.8695068  0.8112607 -1.2316064 0.07979388
## GSM2385724 steatosis  0.36924723 -0.6968099 -0.9745863 -1.1414650 1.47989525
## GSM2385725 steatosis  0.01993391 -0.1333171 -1.1731216 -1.1803881 1.04056815
## GSM2385726 steatosis  3.08558805  0.5342087 -2.8857307 -1.1289022 2.20415261
##                  IL1RAP       MSR1  TNFRSF14       IL15        B2M       TLR1
## GSM2385720  0.327926095  0.4027924 0.3160169  1.9404703 -0.3462406  2.8056676
## GSM2385722  0.145337497  1.2519478 0.4940662  1.2815957 -0.1870264  1.2717048
## GSM2385723 -1.460963433 -0.2745521 0.9335067 -0.3406963 -0.2991018 -1.8724132
## GSM2385724 -0.352411648  0.6069674 1.2074779  1.6431957 -0.2961883  1.7921221
## GSM2385725 -0.504099484  0.5650986 1.2900667  0.7437252 -0.3307479  0.6245865
## GSM2385726 -0.002064212  0.6567890 0.6967889  3.2933405  0.8288701  3.6738902
##                 HPRT1     CX3CR1     TOLLIP         C9      IFIH1      GAPDH
## GSM2385720  0.1060220  1.2821324 -1.1293317  0.7533798  0.9998472 0.68726695
## GSM2385722  0.4387493  0.6580390  0.2812988  0.2609796  1.0355334 0.53900779
## GSM2385723 -0.8278678 -0.2589630  1.4676221 -0.3615333 -0.3201400 0.46547039
## GSM2385724 -0.8597843  1.2250265 -0.1890598  0.2922215  1.2900300 0.07447177
## GSM2385725 -1.2341933  0.8924771  0.7710285  0.2947479  0.7312412 0.13997290
## GSM2385726  0.1581342  1.7473101 -2.0158677 -0.2809214  1.2425143 0.86849115
##                  C4BPA
## GSM2385720  0.46693864
## GSM2385722  0.22515669
## GSM2385723 -0.20885698
## GSM2385724 -0.12497344
## GSM2385725  0.02156057
## GSM2385726 -0.25109679
summary(data)
##     sample               BTK               PTPN6             SERPING1      
##  Length:143         Min.   :-3.76807   Min.   :-2.40889   Min.   :-2.8857  
##  Class :character   1st Qu.:-0.63367   1st Qu.:-0.68114   1st Qu.:-0.6324  
##  Mode  :character   Median : 0.01993   Median : 0.09588   Median : 0.1449  
##                     Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.0000  
##                     3rd Qu.: 0.61370   3rd Qu.: 0.74178   3rd Qu.: 0.6071  
##                     Max.   : 3.08559   Max.   : 2.85415   Max.   : 2.4024  
##       BAX              ITGAE              IL1RAP               MSR1          
##  Min.   :-2.5414   Min.   :-3.21674   Min.   :-3.428031   Min.   :-3.406505  
##  1st Qu.:-0.6528   1st Qu.:-0.64122   1st Qu.:-0.506883   1st Qu.:-0.635337  
##  Median :-0.0618   Median :-0.04877   Median :-0.002064   Median : 0.003534  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.000000   Mean   : 0.000000  
##  3rd Qu.: 0.8257   3rd Qu.: 0.64536   3rd Qu.: 0.472827   3rd Qu.: 0.639954  
##  Max.   : 2.2700   Max.   : 2.96227   Max.   : 2.716055   Max.   : 2.452036  
##     TNFRSF14             IL15              B2M               TLR1         
##  Min.   :-3.89862   Min.   :-3.1023   Min.   :-2.5241   Min.   :-1.94243  
##  1st Qu.:-0.54319   1st Qu.:-0.5648   1st Qu.:-0.5197   1st Qu.:-0.62100  
##  Median : 0.06174   Median :-0.1349   Median :-0.1119   Median : 0.02295  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.64320   3rd Qu.: 0.5188   3rd Qu.: 0.4584   3rd Qu.: 0.47069  
##  Max.   : 2.21249   Max.   : 3.2933   Max.   : 3.3227   Max.   : 3.67389  
##      HPRT1              CX3CR1             TOLLIP               C9          
##  Min.   :-5.26452   Min.   :-3.43118   Min.   :-3.57206   Min.   :-3.54306  
##  1st Qu.:-0.60502   1st Qu.:-0.49653   1st Qu.:-0.53748   1st Qu.:-0.62527  
##  Median : 0.05058   Median : 0.08593   Median : 0.08156   Median : 0.03723  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.49474   3rd Qu.: 0.67503   3rd Qu.: 0.46771   3rd Qu.: 0.56708  
##  Max.   : 2.44572   Max.   : 2.05143   Max.   : 3.18755   Max.   : 4.98057  
##      IFIH1              GAPDH              C4BPA        
##  Min.   :-6.36339   Min.   :-2.65233   Min.   :-1.7778  
##  1st Qu.:-0.38866   1st Qu.:-0.60569   1st Qu.:-0.5082  
##  Median :-0.01679   Median : 0.05987   Median : 0.1222  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.47957   3rd Qu.: 0.53692   3rd Qu.: 0.4495  
##  Max.   : 4.35314   Max.   : 5.06687   Max.   : 4.9840
validation_scaled<-scale(validation[,2:ncol(validation)])

validation_scaled<-as.data.frame(validation_scaled)

validation_scaled<-add_column(validation_scaled, sample=validation$sample, .before = 1)

validation<-validation_scaled

head(validation)
##              sample      GAPDH         BTK      PTPN6   SERPING1         BAX
## GSM836223 steatosis 0.02387721 -1.33068224 -0.6623408  0.5684840  1.05387898
## GSM836224 steatosis 2.24668729  0.02214979  0.4669600  1.1815302 -0.86003240
## GSM836225 steatosis 0.12327816 -0.54412761 -0.4170409 -0.6543308 -0.27697749
## GSM836227 steatosis 0.10458447 -0.69820634 -0.2269735 -0.6983119 -0.68515513
## GSM836237 steatosis 0.71130467 -1.18222884  0.3959176 -0.7417644 -0.09105392
## GSM836238 steatosis 0.51901991 -1.47829753 -0.1557869 -0.5699974 -1.96562297
##                ITGAE     IL1RAP       MSR1   TNFRSF14       IL15        B2M
## GSM836223 -1.9535168  0.8583664 -0.7309989 -1.7229685 -2.4391517 -1.3206688
## GSM836224 -0.9180950 -1.5220980 -0.9279548  1.8403154 -0.8057021 -0.7162281
## GSM836225 -0.9199808 -1.1769914 -1.0773319  0.3327671 -0.3366715 -1.1009666
## GSM836227 -0.4511072 -1.1910689 -0.6349280  1.2523891  0.3247614 -0.2748755
## GSM836237 -0.9658711 -0.6357247 -1.1750510  0.6396480  1.0945745 -0.6372185
## GSM836238 -0.8189395 -0.3916954 -0.9834789  0.8400943  0.8760299 -0.3910408
##                 TLR1      HPRT1      CX3CR1     TOLLIP         C9      IFIH1
## GSM836223 -1.5661492 -0.1520123 -1.25637577 -0.2204031 -0.7322845 -1.7301328
## GSM836224 -1.4572980 -1.9667093  0.07733541  1.9103593  0.2410451 -1.1699885
## GSM836225 -1.0289664  0.5494917 -0.95228147  0.0251141 -0.2698430 -0.2242031
## GSM836227 -1.0332409 -0.4032159 -0.24575007  1.0317068 -1.0914814 -0.5475796
## GSM836237 -1.2092413 -0.4771169 -0.84695676  1.2561434 -0.7836355 -0.5628154
## GSM836238 -0.9128578 -0.4025952 -2.29127661  1.4100057 -0.3789646 -1.3749545
##                C4BPA
## GSM836223 -0.6285687
## GSM836224  0.2540964
## GSM836225 -0.8100016
## GSM836227 -2.0302002
## GSM836237 -0.7502680
## GSM836238 -0.1887388
summary(validation)
##     sample              GAPDH              BTK              PTPN6        
##  Length:32          Min.   :-1.2385   Min.   :-1.8163   Min.   :-1.4473  
##  Class :character   1st Qu.:-0.6825   1st Qu.:-0.8423   1st Qu.:-0.7291  
##  Mode  :character   Median :-0.1798   Median : 0.2320   Median :-0.1914  
##                     Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##                     3rd Qu.: 0.3368   3rd Qu.: 0.6098   3rd Qu.: 0.8136  
##                     Max.   : 3.1212   Max.   : 2.4661   Max.   : 1.8934  
##     SERPING1            BAX              ITGAE             IL1RAP        
##  Min.   :-1.4424   Min.   :-1.9656   Min.   :-1.9535   Min.   :-1.73477  
##  1st Qu.:-0.7310   1st Qu.:-0.6289   1st Qu.:-0.8420   1st Qu.:-0.67820  
##  Median :-0.3446   Median :-0.1944   Median :-0.1155   Median : 0.06399  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.5226   3rd Qu.: 0.4022   3rd Qu.: 0.7614   3rd Qu.: 0.84223  
##  Max.   : 2.1967   Max.   : 3.1260   Max.   : 2.3499   Max.   : 2.26158  
##       MSR1            TNFRSF14            IL15              B2M         
##  Min.   :-2.0704   Min.   :-1.7306   Min.   :-2.4392   Min.   :-2.6052  
##  1st Qu.:-0.7531   1st Qu.:-0.7266   1st Qu.:-0.7307   1st Qu.:-0.7810  
##  Median :-0.1188   Median :-0.2578   Median : 0.3370   Median : 0.1506  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.5935   3rd Qu.: 0.8701   3rd Qu.: 0.6594   3rd Qu.: 0.6504  
##  Max.   : 2.0162   Max.   : 1.8403   Max.   : 1.6772   Max.   : 2.1734  
##       TLR1             HPRT1              CX3CR1             TOLLIP         
##  Min.   :-1.5661   Min.   :-1.96671   Min.   :-2.29128   Min.   :-1.943810  
##  1st Qu.:-0.8440   1st Qu.:-0.44580   1st Qu.:-0.76524   1st Qu.:-0.726506  
##  Median :-0.1844   Median :-0.05742   Median : 0.01552   Median :-0.001058  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.8096   3rd Qu.: 0.60101   3rd Qu.: 0.67865   3rd Qu.: 0.587332  
##  Max.   : 2.1114   Max.   : 2.19022   Max.   : 2.02822   Max.   : 1.947853  
##        C9              IFIH1              C4BPA        
##  Min.   :-1.6159   Min.   :-1.73013   Min.   :-2.0302  
##  1st Qu.:-0.7757   1st Qu.:-0.85155   1st Qu.:-0.6590  
##  Median :-0.2511   Median : 0.01446   Median : 0.1429  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.4430   3rd Qu.: 0.78302   3rd Qu.: 0.5611  
##  Max.   : 2.9267   Max.   : 1.83986   Max.   : 1.9394

lets split the dataset

sb <- data %>%
  split_by(sample, seed = 6534)

attr_names <- names(attributes(sb))

sb_attr <- attributes(sb)

observe how the target category is separated

sb %>%
  compare_target_category()
## NULL

plot the sample

sb %>%
  compare_plot("sample")

compare the variables in the train and test set

sb %>%
  compare_target_numeric()
## # A tibble: 18 x 7
##    variable train_mean test_mean train_sd test_sd  train_z   test_z
##    <chr>         <dbl>     <dbl>    <dbl>   <dbl>    <dbl>    <dbl>
##  1 B2M        -0.00973   0.0226     0.966   1.09  -0.0101   0.0209 
##  2 BAX         0.108    -0.251      0.935   1.11   0.116   -0.227  
##  3 BTK         0.0617   -0.144      0.961   1.08   0.0642  -0.133  
##  4 C4BPA      -0.0566    0.132      0.729   1.45  -0.0776   0.0908 
##  5 C9         -0.0315    0.0734     0.941   1.13  -0.0335   0.0647 
##  6 CX3CR1     -0.00147   0.00343    0.991   1.03  -0.00149  0.00332
##  7 GAPDH       0.0469   -0.109      1.00    1.00   0.0469  -0.109  
##  8 HPRT1      -0.0198    0.0459     0.946   1.13  -0.0209   0.0408 
##  9 IFIH1       0.0157   -0.0364     1.05    0.889  0.0149  -0.0409 
## 10 IL15        0.0214   -0.0498     1.00    1.00   0.0213  -0.0497 
## 11 IL1RAP     -0.0370    0.0860     0.965   1.09  -0.0383   0.0792 
## 12 ITGAE       0.0287   -0.0667     0.983   1.05   0.0292  -0.0637 
## 13 MSR1        0.0208   -0.0484     0.996   1.02   0.0209  -0.0475 
## 14 PTPN6       0.108    -0.251      0.969   1.04   0.112   -0.242  
## 15 SERPING1    0.0187   -0.0434     0.991   1.03   0.0189  -0.0421 
## 16 TLR1        0.0545   -0.127      1.04    0.894  0.0523  -0.142  
## 17 TNFRSF14   -0.0377    0.0877     0.998   1.01  -0.0378   0.0869 
## 18 TOLLIP     -0.0326    0.0759     1.04    0.898 -0.0313   0.0845

Extract the train and test dataset

train <- sb %>%
  extract_set(set = "train")

test <- sb %>%
  extract_set(set = "test")

Random Forest

set up the control .

In this case the repeated 10 cross validation approach is applied which gets repeated for 5 times

trControl <- trainControl(method = "repeatedcv",
                          number = 10,
                          repeats = 5,
                          search = "grid",
                          summaryFunction=twoClassSummary,
                          classProbs=TRUE,
                          savePredictions = T)

build the model with default values initially and observe the results

set.seed(1234)

#Run the model

rf_default <- train(sample~.,
                    data = train,
                    method = "rf",
                    metric = "ROC",
                    trControl = trControl)
# Print the results

print(rf_default)
## Random Forest 
## 
## 100 samples
##  18 predictor
##   2 classes: 'control', 'steatosis' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 89, 90, 90, 90, 90, 90, ... 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##    2    0.8511706  0.5850000  0.9171429
##   10    0.8673810  0.6200000  0.8976190
##   18    0.8609921  0.6383333  0.8723810
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 10.

Now lets extract the best mtry

set.seed(1234)

tuneGrid <- expand.grid(.mtry = c(1: 15))

rf_mtry <- train(sample~.,
                 data = train,
                 method = "rf",
                 metric = "ROC",
                 tuneGrid = tuneGrid,
                 trControl = trControl,
                 importance = TRUE,
                 nodesize = 14,
                 ntree = 50)
print(rf_mtry)
## Random Forest 
## 
## 100 samples
##  18 predictor
##   2 classes: 'control', 'steatosis' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 89, 90, 90, 90, 90, 90, ... 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##    1    0.8454167  0.5550000  0.9052381
##    2    0.8245238  0.6100000  0.8942857
##    3    0.8417857  0.6250000  0.8923810
##    4    0.8472619  0.5933333  0.8880952
##    5    0.8494246  0.6050000  0.8957143
##    6    0.8428770  0.5950000  0.8728571
##    7    0.8441865  0.6050000  0.8795238
##    8    0.8415476  0.6233333  0.8938095
##    9    0.8426786  0.5950000  0.8766667
##   10    0.8497619  0.6200000  0.8885714
##   11    0.8449802  0.6383333  0.8733333
##   12    0.8532540  0.6266667  0.8604762
##   13    0.8500595  0.6500000  0.8733333
##   14    0.8388889  0.6150000  0.8733333
##   15    0.8384722  0.6366667  0.8542857
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 12.
plot(rf_mtry)

#store the mtry to compare it with the other variable which we are going to 

best_mtry <- rf_mtry$bestTune$mtry 

print(best_mtry)
## [1] 12

next lets train the maxnodes

set.seed(1234)

store_maxnode <- list()
tuneGrid <- expand.grid(.mtry = best_mtry)
for (maxnodes in c(5:15)) {
  set.seed(1234)
  rf_maxnode <- train(sample~.,
                      data = train,
                      method = "rf",
                      metric = "ROC",
                      tuneGrid = tuneGrid,
                      trControl = trControl,
                      importance = TRUE,
                      nodesize = 14,
                      maxnodes = maxnodes,
                      ntree = 50)
  current_iteration <- toString(maxnodes)
  store_maxnode[[current_iteration]] <- rf_maxnode
}
results_mtry <- resamples(store_maxnode)

summary(results_mtry)
## 
## Call:
## summary.resamples(object = results_mtry)
## 
## Models: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 
## Number of resamples: 50 
## 
## ROC 
##         Min.   1st Qu.    Median      Mean   3rd Qu. Max. NA's
## 5  0.6875000 0.7857143 0.8660714 0.8665476 0.9627976    1    0
## 6  0.6458333 0.7544643 0.8750000 0.8570238 0.9568452    1    0
## 7  0.6666667 0.7500000 0.8333333 0.8514683 0.9635417    1    0
## 8  0.6190476 0.7703373 0.8437500 0.8516865 0.9427083    1    0
## 9  0.6250000 0.7544643 0.8333333 0.8516865 0.9739583    1    0
## 10 0.6250000 0.7500000 0.8333333 0.8402579 0.9613095    1    0
## 11 0.6250000 0.7500000 0.8333333 0.8453175 0.9754464    1    0
## 12 0.6250000 0.7500000 0.8333333 0.8462698 0.9754464    1    0
## 13 0.6250000 0.7500000 0.8333333 0.8462698 0.9754464    1    0
## 14 0.6250000 0.7500000 0.8333333 0.8462698 0.9754464    1    0
## 15 0.6250000 0.7500000 0.8333333 0.8462698 0.9754464    1    0
## 
## Sens 
##    Min. 1st Qu.    Median      Mean 3rd Qu. Max. NA's
## 5  0.00     0.5 0.6666667 0.6416667  0.7500    1    0
## 6  0.00     0.5 0.6666667 0.6400000  0.7500    1    0
## 7  0.25     0.5 0.6666667 0.6500000  0.7500    1    0
## 8  0.00     0.5 0.6666667 0.6500000  0.7500    1    0
## 9  0.25     0.5 0.6666667 0.6500000  0.7500    1    0
## 10 0.25     0.5 0.6666667 0.6316667  0.7500    1    0
## 11 0.25     0.5 0.6666667 0.6500000  0.9375    1    0
## 12 0.00     0.5 0.6666667 0.6366667  0.7500    1    0
## 13 0.00     0.5 0.6666667 0.6366667  0.7500    1    0
## 14 0.00     0.5 0.6666667 0.6366667  0.7500    1    0
## 15 0.00     0.5 0.6666667 0.6366667  0.7500    1    0
## 
## Spec 
##         Min.   1st Qu.    Median      Mean 3rd Qu. Max. NA's
## 5  0.5000000 0.8333333 0.8571429 0.8790476       1    1    0
## 6  0.5000000 0.8333333 0.8571429 0.8761905       1    1    0
## 7  0.5000000 0.8333333 0.8571429 0.8823810       1    1    0
## 8  0.5000000 0.8333333 0.9285714 0.8819048       1    1    0
## 9  0.5000000 0.8333333 0.8571429 0.8795238       1    1    0
## 10 0.3333333 0.8333333 0.8571429 0.8704762       1    1    0
## 11 0.3333333 0.8333333 0.8571429 0.8766667       1    1    0
## 12 0.3333333 0.8333333 0.8571429 0.8738095       1    1    0
## 13 0.3333333 0.8333333 0.8571429 0.8738095       1    1    0
## 14 0.3333333 0.8333333 0.8571429 0.8738095       1    1    0
## 15 0.3333333 0.8333333 0.8571429 0.8738095       1    1    0

lets try increasing the node and see if there is any change.

set.seed(1234)

store_maxnode <- list()
tuneGrid <- expand.grid(.mtry = best_mtry)
for (maxnodes in c(20: 30)) {
  set.seed(1234)
  rf_maxnode <- train(sample~.,
                      data = train,
                      method = "rf",
                      metric = "ROC",
                      tuneGrid = tuneGrid,
                      trControl = trControl,
                      importance = TRUE,
                      nodesize = 14,
                      maxnodes = maxnodes,
                      ntree = 50)
  key <- toString(maxnodes)
  store_maxnode[[key]] <- rf_maxnode
}
results_node <- resamples(store_maxnode)

summary(results_node)
## 
## Call:
## summary.resamples(object = results_node)
## 
## Models: 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 
## Number of resamples: 50 
## 
## ROC 
##     Min. 1st Qu.    Median      Mean   3rd Qu. Max. NA's
## 20 0.625    0.75 0.8333333 0.8462698 0.9754464    1    0
## 21 0.625    0.75 0.8333333 0.8462698 0.9754464    1    0
## 22 0.625    0.75 0.8333333 0.8462698 0.9754464    1    0
## 23 0.625    0.75 0.8333333 0.8462698 0.9754464    1    0
## 24 0.625    0.75 0.8333333 0.8462698 0.9754464    1    0
## 25 0.625    0.75 0.8333333 0.8462698 0.9754464    1    0
## 26 0.625    0.75 0.8333333 0.8462698 0.9754464    1    0
## 27 0.625    0.75 0.8333333 0.8462698 0.9754464    1    0
## 28 0.625    0.75 0.8333333 0.8462698 0.9754464    1    0
## 29 0.625    0.75 0.8333333 0.8462698 0.9754464    1    0
## 30 0.625    0.75 0.8333333 0.8462698 0.9754464    1    0
## 
## Sens 
##    Min. 1st Qu.    Median      Mean 3rd Qu. Max. NA's
## 20    0     0.5 0.6666667 0.6366667    0.75    1    0
## 21    0     0.5 0.6666667 0.6366667    0.75    1    0
## 22    0     0.5 0.6666667 0.6366667    0.75    1    0
## 23    0     0.5 0.6666667 0.6366667    0.75    1    0
## 24    0     0.5 0.6666667 0.6366667    0.75    1    0
## 25    0     0.5 0.6666667 0.6366667    0.75    1    0
## 26    0     0.5 0.6666667 0.6366667    0.75    1    0
## 27    0     0.5 0.6666667 0.6366667    0.75    1    0
## 28    0     0.5 0.6666667 0.6366667    0.75    1    0
## 29    0     0.5 0.6666667 0.6366667    0.75    1    0
## 30    0     0.5 0.6666667 0.6366667    0.75    1    0
## 
## Spec 
##         Min.   1st Qu.    Median      Mean 3rd Qu. Max. NA's
## 20 0.3333333 0.8333333 0.8571429 0.8738095       1    1    0
## 21 0.3333333 0.8333333 0.8571429 0.8738095       1    1    0
## 22 0.3333333 0.8333333 0.8571429 0.8738095       1    1    0
## 23 0.3333333 0.8333333 0.8571429 0.8738095       1    1    0
## 24 0.3333333 0.8333333 0.8571429 0.8738095       1    1    0
## 25 0.3333333 0.8333333 0.8571429 0.8738095       1    1    0
## 26 0.3333333 0.8333333 0.8571429 0.8738095       1    1    0
## 27 0.3333333 0.8333333 0.8571429 0.8738095       1    1    0
## 28 0.3333333 0.8333333 0.8571429 0.8738095       1    1    0
## 29 0.3333333 0.8333333 0.8571429 0.8738095       1    1    0
## 30 0.3333333 0.8333333 0.8571429 0.8738095       1    1    0

extract the best number of trees and identify the best features for the random forest.

set.seed(1234)

store_maxtrees <- list()
for (ntree in c(10,15,20,25,30,35,40,45,50,55,100,150,200,250, 300,350,400)) {
  set.seed(5678)
  rf_maxtrees <- train(sample~.,
                       data = train,
                       method = "rf",
                       metric = "ROC",
                       tuneGrid = tuneGrid,
                       trControl = trControl,
                       importance = TRUE,
                       nodesize = 14,
                       maxnodes = 5,
                       ntree = ntree)
  key <- toString(ntree)
  store_maxtrees[[key]] <- rf_maxtrees
}

results_tree <- resamples(store_maxtrees)

summary(results_tree)
## 
## Call:
## summary.resamples(object = results_tree)
## 
## Models: 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 100, 150, 200, 250, 300, 350, 400 
## Number of resamples: 50 
## 
## ROC 
##          Min.   1st Qu.    Median      Mean   3rd Qu. Max. NA's
## 10  0.2500000 0.7135417 0.8680556 0.8093254 0.9444444    1    0
## 15  0.2500000 0.7232143 0.8680556 0.8130357 0.9375000    1    0
## 20  0.3333333 0.7313988 0.8888889 0.8320635 0.9459325    1    0
## 25  0.3888889 0.7797619 0.8750000 0.8325198 0.9444444    1    0
## 30  0.4166667 0.7797619 0.8819444 0.8406746 0.9553571    1    0
## 35  0.4166667 0.7413194 0.8750000 0.8357341 0.9459325    1    0
## 40  0.4166667 0.7725694 0.8750000 0.8381349 0.9503968    1    0
## 45  0.3888889 0.7500000 0.8750000 0.8350794 0.9548611    1    0
## 50  0.3888889 0.7410714 0.8750000 0.8358532 0.9459325    1    0
## 55  0.4166667 0.7162698 0.8750000 0.8368849 0.9627976    1    0
## 100 0.3333333 0.7658730 0.8660714 0.8400198 0.9642857    1    0
## 150 0.2777778 0.7658730 0.8750000 0.8328373 0.9627976    1    0
## 200 0.2777778 0.7777778 0.8750000 0.8367063 0.9642857    1    0
## 250 0.2777778 0.7777778 0.8750000 0.8426984 0.9910714    1    0
## 300 0.2777778 0.7857143 0.8908730 0.8448214 0.9910714    1    0
## 350 0.2777778 0.7812500 0.8819444 0.8417262 0.9910714    1    0
## 400 0.2777778 0.7797619 0.8908730 0.8436905 0.9910714    1    0
## 
## Sens 
##     Min.   1st Qu.    Median      Mean 3rd Qu. Max. NA's
## 10  0.25 0.5000000 0.6666667 0.6516667    0.75    1    0
## 15  0.00 0.5000000 0.6666667 0.6066667    0.75    1    0
## 20  0.00 0.3333333 0.6666667 0.6166667    0.75    1    0
## 25  0.00 0.3750000 0.6666667 0.6216667    0.75    1    0
## 30  0.00 0.5000000 0.6666667 0.6333333    0.75    1    0
## 35  0.00 0.3750000 0.6666667 0.6116667    0.75    1    0
## 40  0.00 0.5000000 0.6666667 0.6266667    0.75    1    0
## 45  0.00 0.3750000 0.6666667 0.6183333    0.75    1    0
## 50  0.00 0.5000000 0.6666667 0.6250000    0.75    1    0
## 55  0.00 0.5000000 0.6666667 0.6216667    0.75    1    0
## 100 0.00 0.5000000 0.6666667 0.6416667    0.75    1    0
## 150 0.00 0.5000000 0.6666667 0.6500000    0.75    1    0
## 200 0.00 0.5000000 0.6666667 0.6500000    0.75    1    0
## 250 0.00 0.5000000 0.6666667 0.6500000    0.75    1    0
## 300 0.00 0.5000000 0.6666667 0.6450000    0.75    1    0
## 350 0.00 0.5000000 0.6666667 0.6466667    0.75    1    0
## 400 0.00 0.5000000 0.6666667 0.6416667    0.75    1    0
## 
## Spec 
##          Min.   1st Qu.    Median      Mean   3rd Qu. Max. NA's
## 10  0.5000000 0.7142857 0.8333333 0.8238095 0.9642857    1    0
## 15  0.5000000 0.7142857 0.8452381 0.8447619 1.0000000    1    0
## 20  0.5000000 0.7440476 0.8571429 0.8514286 1.0000000    1    0
## 25  0.6666667 0.8333333 0.8571429 0.8828571 1.0000000    1    0
## 30  0.6666667 0.8333333 0.8571429 0.8714286 1.0000000    1    0
## 35  0.6666667 0.8333333 0.8571429 0.8676190 1.0000000    1    0
## 40  0.5000000 0.8333333 0.8571429 0.8552381 1.0000000    1    0
## 45  0.6666667 0.8333333 0.8571429 0.8685714 1.0000000    1    0
## 50  0.6666667 0.8333333 0.8571429 0.8619048 1.0000000    1    0
## 55  0.6666667 0.8333333 0.8571429 0.8776190 1.0000000    1    0
## 100 0.5000000 0.8333333 0.9285714 0.8833333 1.0000000    1    0
## 150 0.5000000 0.8333333 0.9285714 0.8923810 1.0000000    1    0
## 200 0.5000000 0.8333333 0.9285714 0.8957143 1.0000000    1    0
## 250 0.5000000 0.8333333 0.9285714 0.8923810 1.0000000    1    0
## 300 0.5000000 0.8333333 0.9285714 0.8957143 1.0000000    1    0
## 350 0.5000000 0.8333333 0.9285714 0.8923810 1.0000000    1    0
## 400 0.5000000 0.8333333 0.9285714 0.8957143 1.0000000    1    0

extract the best number of trees and identify the best features for the random forest.

set.seed(1234)

fit_rf <- train(sample~.,
                data= train,
                method = "rf",
                metric = "ROC",
                tuneGrid = tuneGrid,
                trControl = trControl,
                importance = TRUE,
                nodesize = 14,
                ntree = 55,
                maxnodes = 6)

print(fit_rf$bestTune)
##   mtry
## 1   12
res<-fit_rf$results

Make predictions using the test data set

probs_test <- predict(fit_rf,test,type="prob")

test$sample = as.factor(test$sample)

gbm.ROC_test<- roc(predictor=probs_test$steatosis,
                  response=test$sample)
## Setting levels: control = control, case = steatosis
## Setting direction: controls < cases
table(test$sample)
## 
##   control steatosis 
##        16        27
print(gbm.ROC_test[["auc"]])
## Area under the curve: 0.919

Repeat the steps for the validation set

probs_val <- predict(fit_rf,validation,type="prob")

validation$sample=as.factor(validation$sample)

gbm.ROC_val<- roc(predictor=probs_val$steatosis,
               response=validation$sample)
## Setting levels: control = normal, case = steatosis
## Setting direction: controls < cases
table(validation$sample)
## 
##    normal steatosis 
##        13        19
print(gbm.ROC_val[["auc"]])
## Area under the curve: 0.7308
print(colnames(validation))
##  [1] "sample"   "GAPDH"    "BTK"      "PTPN6"    "SERPING1" "BAX"     
##  [7] "ITGAE"    "IL1RAP"   "MSR1"     "TNFRSF14" "IL15"     "B2M"     
## [13] "TLR1"     "HPRT1"    "CX3CR1"   "TOLLIP"   "C9"       "IFIH1"   
## [19] "C4BPA"

plot the ROC curve

plot(gbm.ROC_test,col ="darkred",main="Random Forest ROC curve",
     col.lab="black", cex.lab=1.5)
text(x = 0.7445764,y =0.8701577,label="AUC:0.91",cex=1)

plot(gbm.ROC_val,col ="darkgreen",add=T)
text(x = 0.3274354,y = 0.7076923,label="AUC:0.73",cex=1)

legend(x = "bottomright", 
       c('Test-NC:16,NS:27','Validation-NC:13,NS:19','NC:Number of control','NS:Number of steatosis'),lty=c(1,1),
       lwd=c(2,2),col=c('darkred','darkgreen','white','white'))

lets do the jitter plot for train and validation test.

train_data_plot<-as.data.frame(train)

train_data_plot$sample=as.factor(train_data_plot$sample)

a<-compare_means(c(C9,HPRT1,TLR1,B2M,BAX,GAPDH,BTK,PTPN6,SERPING1,ITGAE,IL1RAP,MSR1,TNFRSF14,IL15,CX3CR1,TOLLIP,IFIH1,C4BPA) ~ sample ,data = train_data_plot)
my_gene_list <- c("C9","HPRT1","TLR1","B2M","BAX","GAPDH","BTK","PTPN6","SERPING1","ITGAE","IL1RAP","MSR1","TNFRSF14","IL15","CX3CR1","TOLLIP","IFIH1","C4BPA")

 my_plot_list <- vector(mode = "list", length = 18)

for (i in 1:18) {
  my_comparisons <- list( c("Steatosis", "Control") )
  p <-ggboxplot(train_data_plot, x = "sample", y = my_gene_list[i],
          color = "sample", palette = "jco",add = "jitter")+
          stat_compare_means(label.y = 16)
  my_plot_list[[i]] <- p
}
 
print(my_plot_list[1:18])
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

Validation Set

validation_data_plot<-as.data.frame(validation)

validation_data_plot$sample=as.factor(validation_data_plot$sample)

compare_means(c(BTK,PTPN6,SERPING1,ITGAE,IL1RAP,MSR1,TNFRSF14,IL15,CX3CR1,TOLLIP,IFIH1,C4BPA) ~ sample ,data = validation_data_plot)
## # A tibble: 12 x 8
##    .y.      group1 group2            p   p.adj p.format p.signif method  
##    <fct>    <chr>  <chr>         <dbl>   <dbl> <chr>    <chr>    <chr>   
##  1 BTK      normal steatosis 0.00593   0.0059  0.00593  **       Wilcoxon
##  2 PTPN6    normal steatosis 0.650     0.65    0.64982  ns       Wilcoxon
##  3 SERPING1 normal steatosis 0.120     0.12    0.12013  ns       Wilcoxon
##  4 ITGAE    normal steatosis 0.000143  0.00014 0.00014  ***      Wilcoxon
##  5 IL1RAP   normal steatosis 0.00103   0.001   0.00103  **       Wilcoxon
##  6 MSR1     normal steatosis 0.00166   0.0017  0.00166  **       Wilcoxon
##  7 TNFRSF14 normal steatosis 0.000744  0.00074 0.00074  ***      Wilcoxon
##  8 IL15     normal steatosis 0.426     0.43    0.42591  ns       Wilcoxon
##  9 CX3CR1   normal steatosis 0.000528  0.00053 0.00053  ***      Wilcoxon
## 10 TOLLIP   normal steatosis 0.0000497 0.00005 5e-05    ****     Wilcoxon
## 11 IFIH1    normal steatosis 0.000744  0.00074 0.00074  ***      Wilcoxon
## 12 C4BPA    normal steatosis 0.00354   0.0035  0.00354  **       Wilcoxon

plot the bar graph for validation set

my_plot_list_2 <- vector(mode = "list", length = 18)

for (i in 1:18) {
  my_comparisons <- list( c("Steatosis", "Control") )
  p <-ggboxplot(validation_data_plot, x = "sample", y = my_gene_list[i],
          color = "sample", palette = "jco",add = "jitter")+
          stat_compare_means(label.y = 16)
  my_plot_list_2[[i]] <- p
}
 
print(my_plot_list_2[1:18])
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

Enricher

setEnrichrSite("Enrichr") # Human genes

websiteLive <- TRUE

dbs <- listEnrichrDbs()

dbs <- c("GO_Molecular_Function_2021", "GO_Cellular_Component_2021", "GO_Biological_Process_2021",
         "Reactome_2021","KEGG_2021_Human")
genes_name<-colnames(train)

genes_name<-genes_name[2:nrow(train)]

if (websiteLive) {
  enriched <- enrichr(genes_name, dbs)
}
## Uploading data to Enrichr... Done.
##   Querying GO_Molecular_Function_2021... Done.
##   Querying GO_Cellular_Component_2021... Done.
##   Querying GO_Biological_Process_2021... Done.
##   Querying Reactome_2021... Done.
##   Querying KEGG_2021_Human... Done.
## Parsing results... Done.
if (websiteLive) datatable(enriched[["KEGG_2021_Human"]])
if (websiteLive) plotEnrich(enriched[[5]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value",title = "Enrichment analysis")

make the correlation

steatosis<-'steatosis'

steatosis_samples<-train[train$sample %in% steatosis, ]  

steatosis_samples<-steatosis_samples[,-c(1)]

steatosis_samples <- cor(steatosis_samples)

corrplot(steatosis_samples,method="circle",cl.lim=c(-1,1),col=colorRampPalette(c("blue","white","red"))(200))

control<-'control'

control_samples<-train[train$sample %in% control, ]  

control_samples<-control_samples[,-c(1)]

control_samples <- cor(control_samples)

corrplot(control_samples,method="circle",cl.lim=c(-1,1),col=colorRampPalette(c("blue","white","red"))(200))

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19042)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] tibble_3.1.1        corrplot_0.89       DT_0.18            
##  [4] enrichR_3.0         ggsignif_0.6.1      ggpubr_0.4.0       
##  [7] pROC_1.17.0.1       dplyr_1.0.6         alookr_0.3.4       
## [10] e1071_1.7-7         caret_6.0-88        ggplot2_3.3.3      
## [13] lattice_0.20-41     randomForest_4.6-14
## 
## loaded via a namespace (and not attached):
##   [1] mlr_2.19.0           readxl_1.3.1         backports_1.2.1     
##   [4] fastmatch_1.1-0      Hmisc_4.5-0          systemfonts_1.0.2   
##   [7] plyr_1.8.6           lazyeval_0.2.2       splines_4.0.3       
##  [10] crosstalk_1.1.1      digest_0.6.27        foreach_1.5.1       
##  [13] htmltools_0.5.1.1    fansi_0.4.2          magrittr_2.0.1      
##  [16] checkmate_2.0.0      BBmisc_1.11          unbalanced_2.0      
##  [19] doParallel_1.0.16    cluster_2.1.0        openxlsx_4.2.3      
##  [22] recipes_0.1.16       gower_0.2.2          extrafont_0.17      
##  [25] sandwich_3.0-1       extrafontdb_1.0      svglite_2.0.0       
##  [28] jpeg_0.1-8.1         colorspace_2.0-1     rvest_1.0.0         
##  [31] ggrepel_0.9.1        haven_2.4.1          xfun_0.23           
##  [34] crayon_1.4.1         jsonlite_1.7.2       libcoin_1.0-8       
##  [37] survival_3.2-7       zoo_1.8-9            iterators_1.0.13    
##  [40] glue_1.4.2           kableExtra_1.3.4     gtable_0.3.0        
##  [43] ipred_0.9-11         webshot_0.5.2        car_3.0-10          
##  [46] Rttf2pt1_1.3.8       abind_1.4-5          scales_1.1.1        
##  [49] mvtnorm_1.1-2        DBI_1.1.1            rstatix_0.7.0       
##  [52] Rcpp_1.0.6           viridisLite_0.4.0    htmlTable_2.2.1     
##  [55] foreign_0.8-80       proxy_0.4-26         Formula_1.2-4       
##  [58] stats4_4.0.3         lava_1.6.9           prodlim_2019.11.13  
##  [61] htmlwidgets_1.5.3    httr_1.4.2           FNN_1.1.3           
##  [64] RColorBrewer_1.1-2   ellipsis_0.3.2       farver_2.1.0        
##  [67] ParamHelpers_1.14    pkgconfig_2.0.3      nnet_7.3-14         
##  [70] sass_0.4.0           utf8_1.2.1           labeling_0.4.2      
##  [73] tidyselect_1.1.1     rlang_0.4.10         reshape2_1.4.4      
##  [76] munsell_0.5.0        cellranger_1.1.0     tools_4.0.3         
##  [79] xgboost_1.4.1.1      cli_2.5.0            generics_0.1.0      
##  [82] ranger_0.12.1        broom_0.7.6          evaluate_0.14       
##  [85] stringr_1.4.0        yaml_2.2.1           ModelMetrics_1.2.2.2
##  [88] knitr_1.33           zip_2.1.1            ggmosaic_0.3.3      
##  [91] caTools_1.18.2       RANN_2.6.1           purrr_0.3.4         
##  [94] nlme_3.1-149         xml2_1.3.2           compiler_4.0.3      
##  [97] rstudioapi_0.13      plotly_4.9.4         curl_4.3.1          
## [100] png_0.1-7            bslib_0.2.5.1        stringi_1.5.3       
## [103] highr_0.9            forcats_0.5.1        gdtools_0.2.3       
## [106] hrbrthemes_0.8.0     Matrix_1.2-18        dlookr_0.4.5        
## [109] ggsci_2.9            vctrs_0.3.8          RcmdrMisc_2.7-1     
## [112] pillar_1.6.1         lifecycle_1.0.0      jquerylib_0.1.4     
## [115] data.table_1.14.0    bitops_1.0-7         R6_2.5.0            
## [118] latticeExtra_0.6-29  gridExtra_2.3        rio_0.5.26          
## [121] codetools_0.2-16     MASS_7.3-53          assertthat_0.2.1    
## [124] rjson_0.2.20         withr_2.4.2          nortest_1.0-4       
## [127] parallel_4.0.3       hms_1.1.0            grid_4.0.3          
## [130] prettydoc_0.4.1      rpart_4.1-15         timeDate_3043.102   
## [133] tidyr_1.1.3          class_7.3-17         rmarkdown_2.8       
## [136] inum_1.0-4           carData_3.0-4        parallelMap_1.5.0   
## [139] partykit_1.2-13      lubridate_1.7.10     base64enc_0.1-3